# Library
import numpy as np
import pandas as pd
# Plotly libraries
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
# Minmax scaler
from sklearn.preprocessing import MinMaxScaler
# Load every dataset used by the examples below (paths are relative to the
# working directory).
covid = pd.read_csv('novel-corona-virus-2019-dataset/covid_19_data.csv')
covid_line = pd.read_csv('novel-corona-virus-2019-dataset/COVID19_line_list_data.csv')
titanic = pd.read_csv('titanic/train.csv')
us_counties = pd.read_csv('us-counties-covid-19-dataset/us-counties.csv')
house = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
netflix = pd.read_csv('netflix-shows/netflix_titles.csv')
world = pd.read_csv('world-university-rankings/cwurData.csv')
google = pd.read_csv('google-play-store-apps/googleplaystore.csv')
user_achieve = pd.read_csv('meta-kaggle/UserAchievements.csv')
user = pd.read_csv('meta-kaggle/Users.csv')
campus = pd.read_csv('factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

# Preview the first three rows of each frame.
covid.head(3)
covid_line.head(3)
titanic.head(3)
us_counties.head(3)
house.head(3)
netflix.head(3)
world.head(3)
google.head(3)
user_achieve.head(3)
user.head(3)
campus.head(3)
Plotly express: Functions that can create entire figures at once. It is the recommended starting point for creating most common figures.
Graph objects: The figures created, manipulated and rendered by the plotly Python library are represented by tree-like data structures which are automatically serialized to JSON for rendering by the Plotly.js JavaScript library.
Basic elements in layout:
- xaxis_title: Plot x axis label
- yaxis_title: Plot y axis label
- title: Plot title
- title_font_size: Plot title font size
- height: Height of chart
- width: Width of chart
- showlegend: False - Disable legend
- xaxis_type/ yaxis_type: Type of X/Y axis
- xaxis_showgrid/ yaxis_showgrid: Display grids or not
- gridcolor: Color of grid
- gridwidth: Width of grid
Basic elements in axes:
- ticks: 'inside'/'outside' - Axes tick marks
- nticks: Number of ticks
- tick0: First tick position
- tickwidth: Width of ticks
- tickcolor: Color of ticks
- ticklen: Length of ticks
- showticklabels: Display tick labels or not
- tickangle: Angle of tick labels
- tickfont: Font of tick label ex: dict(family = 'Rockwell', color = 'crimson', size = 14)
- tickprefix: Prefix of tick labels
- showline: Line of chart outline
- linewidth: Line width of chart outline, label = 2
- linecolor: Line color of chart outline, 'black'
- mirror: Opposite side of plotting area
- range: Range limit of axis
Purpose: Relationship between numerical values
Question: How much dependence between SalesPrices and Area?
# Basic scatter: lot area on x, sale price on y.
fig = px.scatter(house, x='LotArea', y='SalePrice')
fig.update_layout(
    title=' Sales Price vs Area',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()
Purpose: Relationship between numerical values with a categorical field.
Question: How much dependency between SalesPrices and Area with Shape of plot?
# Same scatter, with each point coloured by its LotShape category.
fig = px.scatter(house, x='LotArea', y='SalePrice', color='LotShape')
fig.update_layout(
    title='Sales Price vs Area with Shape',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()
Purpose: Relationship between numerical values, with points categorized by one field and sized by another numerical field.
Question: How much dependency between Quality of Education and Students Score for different countries based on number of students?
# Bubble scatter: colour encodes country, bubble size encodes citations.
fig = px.scatter(
    world,
    x='quality_of_education',
    y='score',
    color='country',
    size='citations',
)
fig.update_layout(
    title='Quality of Education vs Score with Country and Students',
    xaxis_title='Quality of Education',
    yaxis_title='Score',
)
fig.show()
Purpose: Relationship between numerical values, highlighting the densest data points with a color gradient.
Question: What is the relationship between Ratings and Reviews, highlighting the most frequently occurring ratings?
# Strip every non-digit character from the review counts before converting.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave values untouched and make
# pd.to_numeric fail on any non-numeric entry.
google['Reviews'] = google['Reviews'].str.replace(r'\D', '', regex=True)
google['Reviews'] = pd.to_numeric(google['Reviews'])

# Marker colour follows the review count, giving a gradient over the points.
fig = go.Figure(
    data=go.Scatter(
        x=google['Rating'],
        y=google['Reviews'],
        mode='markers',
        marker_color=google['Reviews'],
    )
)
fig.update_xaxes(range=[0, 6])
fig.update_layout(
    title='Playstore Apps - Reviews vs Ratings with gradient',
    xaxis_title='Ratings',
    yaxis_title='Reviews',
)
fig.show()
Purpose: Relationship between variables with respect to time
Question: How many COVID deaths were observed over time?
# Total deaths per observation date, summed over every country/region.
total_confirmed = covid[['ObservationDate', 'Deaths']].copy()
# Parse the dates before grouping so the line is drawn chronologically; if the
# column is still text as read from the CSV, grouping on the raw strings would
# order the points lexicographically instead.
total_confirmed['ObservationDate'] = pd.to_datetime(total_confirmed['ObservationDate'])
total_confirmed = total_confirmed.groupby('ObservationDate').sum().reset_index()

fig = go.Figure(
    data=go.Scatter(
        x=total_confirmed['ObservationDate'],
        y=total_confirmed['Deaths'],
        mode='lines',
    )
)
# The plotted series is Deaths, so the labels say deaths rather than cases.
fig.update_layout(
    title='Number of COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)
fig.show()
Purpose: Relationship between variables with respect to time.
Question: How many COVID deaths were observed over time for different countries? Use a different line style for each country (Dash, Dashdot, Dot).
def _deaths_by_date(country):
    """Return total Deaths per ObservationDate for one Country/Region value."""
    rows = covid.loc[covid['Country/Region'] == country, ['ObservationDate', 'Deaths']]
    # Parse dates so the grouped result is sorted chronologically rather than
    # by the text of the date.
    rows = rows.assign(ObservationDate=pd.to_datetime(rows['ObservationDate']))
    return rows.groupby('ObservationDate').sum().reset_index()


covid_can = _deaths_by_date('Canada')
covid_rus = _deaths_by_date('Russia')
covid_uk = _deaths_by_date('UK')

# One trace per country, each with its own dash style.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=covid_can['ObservationDate'], y=covid_can['Deaths'],
    name='Canada-Dot',
    line=dict(color='royalblue', width=4, dash='dot'),
))
fig.add_trace(go.Scatter(
    x=covid_rus['ObservationDate'], y=covid_rus['Deaths'],
    name='Russia-Dashdot',
    line=dict(color='green', width=4, dash='dashdot'),
))
fig.add_trace(go.Scatter(
    x=covid_uk['ObservationDate'], y=covid_uk['Deaths'],
    name='UK-Dash',
    line=dict(color='brown', width=4, dash='dash'),
))
# The plotted series is Deaths, so the labels say deaths rather than cases.
fig.update_layout(
    title='Number of COVID deaths over time for different countries',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)
fig.show()
Purpose: Displays a quantitative representation, highlighting the most frequent category with the size of the bubble.
Question: How many people travelled in each class of the Titanic? Highlight the most used class.
# Passenger count per class as a two-column frame (Pclass, Count).
# rename_axis + reset_index(name=...) yields the same column names on every
# pandas version; the older rename({'index': ...}) idiom breaks on pandas >= 2,
# where value_counts() already names its index after the column.
pclass = titanic['Pclass'].value_counts().rename_axis('Pclass').reset_index(name='Count')

# Bubble size is proportional to the count (scaled down by 0.3).
fig = go.Figure(data=[
    go.Scatter(
        x=pclass['Pclass'],
        y=pclass['Count'],
        mode='markers',
        marker=dict(size=pclass['Count'] * 0.3),
    )
])
fig.update_layout(
    title='People travelled in each class of titanic',
    xaxis_title='Class',
    yaxis_title='Number of People',
)
fig.show()
Purpose: Displays a quantitative representation, highlighting the most frequent category with the color gradient of the bubble
Question: How many of each aged category people travelled in titanic? Highlight the most occured age.
# Keep only rows with no missing values in any column.
titanic = titanic.dropna()

# Bucket Age into four labelled bands. np.select takes the first matching
# condition, which mirrors a nested np.where chain; "NULL" is the fallback
# when no band matches.
age_conditions = [
    titanic['Age'] < 19,
    (titanic['Age'] > 18) & (titanic['Age'] <= 30),
    (titanic['Age'] > 30) & (titanic['Age'] <= 50),
    titanic['Age'] > 50,
]
age_labels = ["below 19", "19-30", "31-50", "Above 50"]
titanic['age_category'] = np.select(age_conditions, age_labels, default="NULL")

# Count per band. rename_axis + reset_index(name=...) gives stable column
# names across pandas versions (the {'index': ...} rename breaks on pandas >= 2).
age = titanic['age_category'].value_counts().rename_axis('age_category').reset_index(name='Count')

# Both bubble size and colour follow the count; showscale adds the colour bar.
fig = go.Figure(data=[
    go.Scatter(
        x=age['age_category'],
        y=age['Count'],
        mode='markers',
        marker=dict(color=age['Count'], size=age['Count'], showscale=True),
    )
])
fig.update_layout(
    title='Different Age People in Titanic',
    xaxis_title='Age Category',
    yaxis_title='Number of People',
)
fig.show()
Purpose: Displays quantitative representation of variable
Question: How many universities in each country have good score? (filtered for universities with score greater than 60)
# Number of universities per country among those scoring above 60.
# rename_axis + reset_index(name=...) gives the columns (country, count) on
# every pandas version; renaming an 'index' column fails on pandas >= 2.
top_countries = (
    world.loc[world['score'] > 60, 'country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

fig = go.Figure(data=[go.Bar(x=top_countries['country'], y=top_countries['count'])])
fig.update_layout(
    title_text='Top Countries with number of Universities score greater than 60',
    xaxis_title='Country',
    yaxis_title='Number of Universities',
)
fig.show()
Purpose: Displays quantitative representation of a variable highlighting the most counts with color gradient and text position for all bars
Question: In which genre does most of google playstore apps fall? Highlight from top count to low count
# Ten most common genres as a (Genres, Count) frame. The rename_axis /
# reset_index(name=...) form is stable across pandas versions, unlike renaming
# an 'index' column, which fails on pandas >= 2.
apps = google['Genres'].value_counts()[:10].rename_axis('Genres').reset_index(name='Count')
apps.head(3)

# Bars are coloured by their count on the Viridis scale, with the count
# printed outside each bar.
fig = go.Figure(go.Bar(
    x=apps['Genres'],
    y=apps['Count'],
    marker={'color': apps['Count'], 'colorscale': 'Viridis'},
    text=apps['Count'],
    textposition='outside',
))
fig.update_layout(
    title_text='Top Genres Google Playstore Apps',
    xaxis_title='App Genres',
    yaxis_title='Number of Apps',
)
fig.show()
Purpose: Displays quantitative representation of a variable grouping/stacking the bars
Question: How many shows/movies were released in Netflix by India & United States over past 5 years? (Grouping or Stacking countries)
def _release_counts(country):
    """Return titles per release year (2015-2020) for one exact country value."""
    in_window = netflix['release_year'].isin(list(range(2015, 2021)))
    picked = netflix.loc[(netflix['country'] == country) & in_window, 'release_year']
    # rename_axis + reset_index(name=...) keeps the (release_year, count)
    # column names stable across pandas versions.
    return picked.value_counts().rename_axis('release_year').reset_index(name='count')


top_release_india = _release_counts('India')
top_release_us = _release_counts('United States')

# One bar trace per country; barmode='stack' stacks them per year.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=top_release_india['release_year'], y=top_release_india['count'],
    name='India', marker_color='blue',
))
fig.add_trace(go.Bar(
    x=top_release_us['release_year'], y=top_release_us['count'],
    name='United States', marker_color='violet',
))
fig.update_layout(
    title_text='Netflix shows by India/US over past 5 years',
    xaxis_title='Year',
    yaxis_title='Number of Shows',
    barmode='stack',
)
fig.show()
Purpose: Displays collected view of different categorical features with respect to single numerical variable.
Question: What is the total fare for passengers of each gender, by class and port of embarkation? (Facet variables - Survived, Pclass, Grouped Bar - Embarked, Single Numerical (Y axis) - Fare)
# Total fare for every (Sex, Survived, Embarked, Pclass) combination.
facet_titanic = (
    titanic[['Sex', 'Survived', 'Embarked', 'Pclass', 'Fare']]
    .groupby(['Sex', 'Survived', 'Embarked', 'Pclass'])
    .agg('sum')
    .reset_index()
)
facet_titanic.head(2)

# Grouped bars by Embarked, faceted into rows by Survived and columns by Pclass.
fig = px.bar(
    facet_titanic,
    x="Sex",
    y="Fare",
    color="Embarked",
    barmode="group",
    facet_row="Survived",
    facet_col="Pclass",
)
# The chart is split by Sex, Survived, Pclass and Embarked; Age is not used,
# so the title names the dimensions actually shown.
fig.update_layout(
    title_text='Facet view of Titanic passengers Fare with respect to Sex, Survived, Class, Embarked'
)
fig.show()
Purpose: Displays quantitative representation of a variable in horizontal manner.
Question: How many playstore apps fall in each categories?
# Fifteen most common categories as a (Category, Count) frame.
# - rename_axis + reset_index(name=...) keeps column names stable across
#   pandas versions (renaming an 'index' column fails on pandas >= 2).
# - ascending must be a real boolean: the string 'False' is truthy on old
#   pandas and rejected outright by newer releases.
app_category = (
    google['Category']
    .value_counts()[:15]
    .rename_axis('Category')
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)
app_category[:3]

# orientation='h' draws horizontal bars: categories on y, counts on x.
fig = go.Figure(go.Bar(y=app_category['Category'], x=app_category['Count'], orientation='h'))
# Axis titles follow the data actually placed on each axis.
fig.update_layout(
    title_text='Top 15 Google Playstore App Categories',
    xaxis_title='Number of Apps',
    yaxis_title='Category',
)
fig.show()
Purpose: Displays quantitative representation of a variable in a horizontal manner with dots as categorical feature.
Question: What is the average score for different countries in 2014 & 2015?
# Mean score for every (country, year) pair.
grouped_df = (
    world[['country', 'year', 'score']]
    .groupby(['country', 'year'])
    .mean()
    .reset_index()
)
# First 20 countries of each of the two years being compared.
grouped_df_2014 = grouped_df.loc[grouped_df['year'] == 2014, ['country', 'score']].head(20)
grouped_df_2015 = grouped_df.loc[grouped_df['year'] == 2015, ['country', 'score']].head(20)
grouped_df_2015.head(3)

# One marker trace per year: countries on y, mean score on x.
fig = go.Figure()
for year_frame, dot_color, year_label in (
    (grouped_df_2014, 'red', '2014'),
    (grouped_df_2015, 'blue', '2015'),
):
    fig.add_trace(go.Scatter(
        y=year_frame['country'],
        x=year_frame['score'],
        marker=dict(color=dot_color, size=12),
        mode='markers',
        name=year_label,
    ))
fig.update_layout(
    title='Average scores for years - 2014&2015',
    xaxis_title='Score',
    yaxis_title='Country',
)
fig.show()
Purpose: Displays quantitative representation in pie with label and textinfo
Question: What is split distribution count of netflix program types?
# Title count per programme type as a (type, count) frame; rename_axis +
# reset_index(name=...) is stable across pandas versions, whereas renaming an
# 'index' column fails on pandas >= 2.
net_category = netflix['type'].value_counts().rename_axis('type').reset_index(name='count')
net_category.head(2)

fig = go.Figure(go.Pie(labels=net_category['type'], values=net_category['count']))
# Slice text shows value and percent; hover shows label and percent.
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value+percent',
    textfont_size=15,
    insidetextorientation='radial',
)
fig.update_layout(title='Netflix Show Types', title_x=0.5)
fig.show()
Purpose: Displays quantitative representation in pie with custom colors for labels
Question: What is split distribution count of titanic age categories? Highlight each category with different color.
# Passenger count per age band as an (age_category, count) frame; the
# rename_axis / reset_index(name=...) form works on every pandas version,
# unlike renaming an 'index' column (which fails on pandas >= 2).
titanic_age = (
    titanic['age_category']
    .value_counts()
    .rename_axis('age_category')
    .reset_index(name='count')
)
titanic_age

# One custom colour per slice, with a black outline around each slice.
colors = ['red', 'green', 'yellow', 'blue']
fig = go.Figure(go.Pie(labels=titanic_age['age_category'], values=titanic_age['count']))
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='percent+label',
    textfont_size=15,
    marker=dict(colors=colors, line=dict(color='#000000', width=2)),
)
fig.update_layout(title='Titanic Age Categories', title_x=0.5)
fig.show()
Purpose: Displays quantitative representation in pie with donut shape.
Question: What is distribution count of google playstore apps content rating?
# App count per content rating as a ('Content Rating', count) frame; the
# rename_axis / reset_index(name=...) form is stable across pandas versions,
# whereas renaming an 'index' column fails on pandas >= 2.
content = (
    google['Content Rating']
    .value_counts()
    .rename_axis('Content Rating')
    .reset_index(name='count')
)

# hole=0.9 cuts out the centre of the pie, leaving a thin donut ring.
fig = go.Figure(go.Pie(labels=content['Content Rating'], values=content['count'], hole=0.9))
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=15)
fig.update_layout(title='Google Apps Content Rating', title_x=0.5)
fig.show()
Purpose: Displays quantitative representation of many categorical variables by size distribution.
Question: Describe the spread among age category, sex and survived people with respect to fare in titanic?
# Readable survival label: 1 -> 'Survived', 0 -> 'Died', anything else -> 'null'.
titanic['survived_or_not'] = np.select(
    [titanic['Survived'] == 1, titanic['Survived'] == 0],
    ['Survived', 'Died'],
    default='null',
)

# Sum Fare over every combination of the categorical columns.
sunburst_columns = ['Sex', 'survived_or_not', 'Embarked', 'age_category', 'Cabin', 'Fare']
sunburst_keys = ['Sex', 'survived_or_not', 'age_category', 'Cabin', 'Embarked']
sun_df = titanic[sunburst_columns].groupby(sunburst_keys).sum().reset_index()
sun_df.head(3)

# Rings from the centre outwards: Sex -> survival -> age band, sized by Fare.
fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
)
fig.update_layout(
    title='Titanic distribution by Sex, Survived, Age Category',
    title_x=0.5,
)
fig.show()
Purpose: Displays quantitative representation of many categorical variables by size distribution with color gradient
Question: Describe the spread among age category, sex and survival with respect to fare in the Titanic data, highlighting the most frequent scenario with a color gradient.
# Same sunburst, with sectors coloured on a continuous scale driven by Fare.
fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
    color=sun_df['Fare'],
    color_continuous_scale='orrd',
)
fig.update_layout(
    title="Titanic distribution by Sex, Survived, Age Category",
    title_x=0.5,
)
fig.show()
Purpose: Displays quantitative representation of many categorical variables by size distribution with discrete color
Question: Describe the spread among age category, sex and survival with respect to fare, highlighting survival in discrete color.
# Same sunburst, coloured discretely by survival status via an explicit map.
survival_palette = {'(?)': 'black', 'Died': 'red', 'Survived': 'darkblue'}
fig = px.sunburst(
    sun_df,
    path=['Sex', 'survived_or_not', 'age_category'],
    values='Fare',
    color='survived_or_not',
    color_discrete_map=survival_palette,
)
fig.update_layout(
    title='Titanic distribution by Sex, Survived, Age Category',
    title_x=0.5,
)
fig.show()
Purpose: Relationship between variables with respect to time
Question: How many COVID deaths were observed over time in Australia?
# Convert the date column once so the x axis is a real datetime axis.
covid['ObservationDate'] = pd.to_datetime(covid['ObservationDate'])

# Total deaths per day for Australia.
covid_aus = (
    covid.loc[covid['Country/Region'] == 'Australia', ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)

fig = go.Figure(
    data=go.Scatter(
        x=covid_aus['ObservationDate'],
        y=covid_aus['Deaths'],
        mode='lines',
        marker_color='violet',
    )
)
# The plotted series is Deaths, so the labels say deaths rather than cases.
fig.update_layout(
    title='Australia COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)
fig.show()
Purpose: Relationship between variables with respect to time with range slider
Question: How many COVID deaths were observed over time with range slider in France?
# Total deaths per day for France.
covid_fra = (
    covid.loc[covid['Country/Region'] == 'France', ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)
covid_fra[:3]

fig = go.Figure(
    data=go.Scatter(
        x=covid_fra['ObservationDate'],
        y=covid_fra['Deaths'],
        mode='lines',
        marker_color='darkblue',
    )
)
# Adds the draggable range slider under the x axis.
fig.update_xaxes(rangeslider_visible=True)
# The plotted series is Deaths, so the labels say deaths rather than cases.
fig.update_layout(
    title='France COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
)
fig.show()
Purpose: Relationship between variables with respect to time with custom date range
Question: How many COVID deaths were observed between March and July in Mexico?
# Total deaths per day for Mexico.
covid_mex = (
    covid.loc[covid['Country/Region'] == 'Mexico', ['ObservationDate', 'Deaths']]
    .groupby(['ObservationDate'])
    .agg('sum')
    .reset_index()
)

fig = go.Figure(
    data=go.Scatter(
        x=covid_mex['ObservationDate'],
        y=covid_mex['Deaths'],
        mode='lines',
        marker_color='darkred',
    )
)
# Restrict the visible window to March through the start of July 2020.
# The previous upper bound, '2020-06-31', is not a valid calendar date.
# The plotted series is Deaths, so the labels say deaths rather than cases.
fig.update_layout(
    title='Mexico COVID deaths over time',
    xaxis_title='Date',
    yaxis_title='Number of deaths',
    xaxis_range=['2020-03-01', '2020-07-01'],
)
fig.show()
Purpose: Display time range for labels
Question: How long did Top 10 ranked kagglers take to become Grandmaster?
# Achievement rows for the competitions track with a current ranking of 10 or better.
is_top_competitor = (
    (user_achieve['AchievementType'] == 'Competitions')
    & (user_achieve['CurrentRanking'] <= 10)
)
top_rank = user_achieve[is_top_competitor]

# Matching user records, keyed on the same column name as the achievements.
user_final = user[user['Id'].isin(top_rank['UserId'])]
user_final = user_final.rename(columns={'Id': 'UserId'})
user_df = user_final.merge(top_rank, on='UserId')

# Parse both dates, then take the whole-day gap between them.
for date_column in ('RegisterDate', 'TierAchievementDate'):
    user_df[date_column] = pd.to_datetime(user_df[date_column])
user_df['diff_days'] = (user_df['TierAchievementDate'] - user_df['RegisterDate']).dt.days

kept_columns = [
    'CurrentRanking', 'UserName', 'DisplayName',
    'RegisterDate', 'TierAchievementDate', 'diff_days',
]
user_df = user_df[kept_columns]
user_df.head(2)

# The frame handed to create_gantt uses these column names:
#   Task     - label name
#   Start    - start date
#   Finish   - end date
#   Complete - continuous variable
gantt_df = pd.DataFrame({
    'Task': user_df['DisplayName'],
    'Start': user_df['RegisterDate'],
    'Finish': user_df['TierAchievementDate'],
    'Complete': user_df['diff_days'],
})
gantt_df

fig = ff.create_gantt(
    gantt_df,
    show_colorbar=False,
    showgrid_x=True,
    showgrid_y=True,
)
fig.update_layout(title='Top 10 Ranked Kagglers Duration to Become Grandmaster')
fig.show()
Purpose: Display time range for labels with gradient.
Question: How long did the Top 10 ranked Kagglers take to become Grandmaster? Also differentiate the Kagglers from those who took the most time to those who took the least.
# Rescale the day differences ('Complete') to 0-100 so they can drive the
# colour gradient of the bars.
scaler = MinMaxScaler()
gantt_df_grad = gantt_df.sort_values('Complete', ascending=False)
gantt_df_grad[['Complete']] = scaler.fit_transform(gantt_df_grad[['Complete']]) * 100
gantt_df_grad

# index_col='Complete' colours each bar by its scaled duration.
fig = ff.create_gantt(
    gantt_df_grad,
    colors='Blackbody',
    index_col='Complete',
    show_colorbar=True,
    bar_width=0.2,
    showgrid_x=True,
    showgrid_y=True,
)
fig.update_layout(title='Top 10 Ranked Kagglers Duration to Become Grandmaster with time gradient')
# Render explicitly, like every other example in this file; without this call
# the figure is never shown when the code runs as a script.
fig.show()
Purpose: Display distribution of a continuous variable.
Question: How are the score spread for different universities in Germany?
# Scores of every row whose country is Germany.
germany_score = world.loc[world['country'] == 'Germany', 'score']
germany_score.head(2)

# Single box plot of those scores.
germany_box = go.Box(y=germany_score, name='Germany Score')
fig = go.Figure(germany_box)
fig.update_layout(title='Distribution of Germany University Scores')
fig.show()
Purpose: Display distribution of a continuous variable for two or more groups
Question: How is the score spread for universities in developing countries - India & Brazil ?
# Score series for the two countries being compared.
score_brazil = world.loc[world['country'] == 'Brazil', 'score']
score_india = world.loc[world['country'] == 'India', 'score']

# Side-by-side boxes: India first (blue), then Brazil (red).
fig = go.Figure()
for country_scores, box_color, box_label in (
    (score_india, 'blue', 'India Score'),
    (score_brazil, 'red', 'Brazil Score'),
):
    fig.add_trace(go.Box(y=country_scores, marker_color=box_color, name=box_label))
fig.update_layout(
    title='Distribution of University Scores for Developing Countries - India & Brazil'
)
fig.show()
Purpose: Display distribution of a continuous variable for two or more groups with Mean and Standard Deviation.
Question: How is the rating distribution for playstore app categories- Map & Lifestyle? Highlight with mean and standard Deviation
# Rating series for the two categories being compared.
rating_maps = google.loc[google['Category'] == 'MAPS_AND_NAVIGATION', 'Rating']
rating_life = google.loc[google['Category'] == 'LIFESTYLE', 'Rating']

fig = go.Figure()
# boxmean=True draws only the mean on the box.
fig.add_trace(
    go.Box(y=rating_maps, boxmean=True, marker_color='green', name='Maps Apps Rating')
)
# boxmean='sd' draws the mean together with the standard deviation.
fig.add_trace(
    go.Box(y=rating_life, boxmean='sd', marker_color='darkorchid', name='Lifestyle Apps Rating')
)
fig.update_layout(
    title='Distribution of Google Playstore App categories - Maps &Lifestyle'
)
fig.show()
Purpose: Display distribution of a continuous variable for two or more groups with all different boxplot visualization
Question: How is the rating distribution for 4 playstore app categories? Highlight with possible boxpoints for each plot
# Rating series for the four categories being compared.
rating_maps = google[google['Category'] == 'MAPS_AND_NAVIGATION']['Rating']
rating_life = google[google['Category'] == 'LIFESTYLE']['Rating']
rating_tool = google[google['Category'] == 'TOOLS']['Rating']
rating_business = google[google['Category'] == 'BUSINESS']['Rating']

fig = go.Figure()
# boxpoints='all': every observation is drawn beside the box (jittered and
# offset with pointpos).
fig.add_trace(go.Box(
    y=rating_maps,
    jitter=0.3,
    pointpos=-1.8,
    boxpoints='all',
    marker_color='green',
    name='Maps Apps Rating - All points',
))
# boxpoints=False: no individual points, only box and whiskers.
fig.add_trace(go.Box(
    y=rating_life,
    boxpoints=False,
    marker_color='darkorchid',
    name='Lifestyle Apps Rating - Only Whiskers',
))
# boxpoints='suspectedoutliers'. The marker colour is specified once, inside
# the marker dict; the earlier version passed both marker=dict(color='black')
# and marker_color='magenta', two conflicting values for the same property.
# NOTE(review): 'magenta' is kept on the assumption that the marker_color
# keyword took precedence - confirm the rendered colour.
fig.add_trace(go.Box(
    y=rating_tool,
    boxpoints='suspectedoutliers',
    marker=dict(color='magenta', outliercolor='black'),
    name='Tools Apps Rating - Suspected Outliers',
))
# boxpoints='outliers': only points outside the whiskers are drawn.
fig.add_trace(go.Box(
    y=rating_business,
    boxpoints='outliers',
    marker_color='chocolate',
    name='Business Apps Rating - Outliers & Whiskers',
))
fig.update_layout(
    title='Distribution of Google Playstore App categories - Maps, Lifestyle, Tools, Business'
)
fig.show()
Purpose: Display distribution of continuous variable
Question: What is the salary distribution of Computer management graduates?
# Salaries of 'Comm&Mgmt' rows, after dropping rows with any missing value.
comm_mgmt_rows = campus[campus['degree_t'] == 'Comm&Mgmt']
campus_computer = comm_mgmt_rows.dropna()['salary']

# Explicit binning: 10,000-wide bins from 200,000 up to 1,000,000.
salary_bins = dict(start=200000, end=1000000, size=10000)
fig = go.Figure(
    data=[go.Histogram(x=campus_computer, marker_color='green', xbins=salary_bins)]
)
fig.show()
Purpose: Display distribution of a continuous variable.
Question: What is the salary distribution of Science & Technology graduates in a normalized manner?
# Salaries of 'Sci&Tech' rows.
campus_science = campus[campus['degree_t'] == 'Sci&Tech']['salary']
campus_science.head(1)

# histnorm='probability' normalises bar heights to fractions of the total
# instead of raw counts.
# To get a horizontal plot, pass the data as y instead: y=campus_science.
fig = go.Figure(
    data=[go.Histogram(x=campus_science, histnorm='probability', marker_color='orange')]
)
# The y axis shows probabilities (because of histnorm), not counts.
fig.update_layout(
    title='Distribution of Salary for Science Graduates',
    xaxis_title='Salary',
    yaxis_title='Probability',
)
fig.show()
Purpose: Display distribution of a continuous variable for different groups
Question: What is the percentage distribution of Computer and Science graduates in a overlaid manner?
# Degree percentages for the two degree types being compared.
per_com = campus.loc[campus['degree_t'] == 'Comm&Mgmt', 'degree_p']
per_sci = campus.loc[campus['degree_t'] == 'Sci&Tech', 'degree_p']

fig = go.Figure()
fig.add_trace(go.Histogram(x=per_com, marker_color='green', name='Computer Graduates'))
fig.add_trace(go.Histogram(x=per_sci, marker_color='orange', name='Science Graduates'))
# Draw the two histograms on top of each other; the reduced opacity keeps
# both visible where they overlap.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.update_layout(
    title='Distribution of Percentage for Computer &Sciecen Graduates',
    xaxis_title='Percentage',
    yaxis_title='Counts',
)
fig.show()
Purpose: Display distribution of a continuous variable for different groups
Question: What is the percentage distribution of Computer and Science graduates in a stack manner?
# Same two histograms, stacked instead of overlaid.
fig = go.Figure()
for percentages, bar_color, legend_name in (
    (per_com, 'green', 'Computer Graduates'),
    (per_sci, 'orange', 'Science Graduates'),
):
    fig.add_trace(go.Histogram(x=percentages, marker_color=bar_color, name=legend_name))
# Stack the bars of both traces and apply the same reduced opacity.
fig.update_layout(barmode='stack')
fig.update_traces(opacity=0.75)
fig.update_layout(
    title='Distribution of Percentage for Computer & Science Graduates',
    xaxis_title='Percentage',
    yaxis_title='Counts',
)
fig.show()
Purpose: Display distribution of a continuous variable.
Question: What is the price distribution for house with 4 rated condition?
# Sale prices for houses with OverallCond 4, 5 and 6 (only the first is
# plotted in this example).
class_1, class_2, class_3 = (
    house.loc[house['OverallCond'] == condition, 'SalePrice']
    for condition in (4, 5, 6)
)

# Distribution plot for the condition-4 prices with 10,000-wide bins.
hist_data = [class_1]
group_labels = ['Price Distribution for 4 rated condition houses']
colors = ['blue']
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=colors,
    bin_size=[10000],
)
fig.show()
Purpose: Display distribution of a continuous variable for multiple categories
Question: What is the price distribution for houses with 4,5 & 6 rated condition?
# Three price series on one distribution plot, one colour and bin size each.
hist_data = [class_1, class_2, class_3]
group_labels = [
    'Price Distribution for 4 rated condition houses',
    'Price Distribution for 5 rated condition houses',
    'Price Distribution for 6 rated condition houses',
]
colors = ['blue', 'green', 'orange']
fig = ff.create_distplot(
    hist_data,
    group_labels,
    colors=colors,
    bin_size=[10000, 10000, 10000],
)
fig.show()
Purpose: Display distribution of a continuous variable for multiple categories with hist curve instead of bar.
Question: What is the price distribution for houses with 4,5 & 6 rated condition?
# Same three series, drawn without the histogram bars (show_hist=False).
hist_data = [class_1, class_2, class_3]
group_labels = [
    'Price Distribution for 4 rated condition house',
    'Price Distribution for 5 rated condition houses',
    'Price Distribution for 6 rated condition houses',
]
colors = ['blue', 'green', 'orange']
fig = ff.create_distplot(
    hist_data,
    group_labels,
    show_hist=False,
    colors=colors,
    bin_size=[10000, 10000, 10000],
)
fig.show()
Purpose: Display the variability of data and used on graphs to indicate the error
Question: What is the variability or SD of house prices? Common values between high and low interval
# NOTE(review): `lot_area` is referenced here but never assigned anywhere
# earlier in this file, so the example raised NameError. It is reconstructed
# below as the mean SalePrice per LotArea (sorted by LotArea, which suits a
# line plot) - confirm this matches the intended definition.
lot_area = house[['LotArea', 'SalePrice']].groupby('LotArea').mean().reset_index()

# Plain line first, for comparison with the error-bar version below.
fig = go.Figure(data=go.Scatter(x=lot_area['LotArea'], y=lot_area['SalePrice'], mode='lines'))
fig.show()

# Same data with symmetric error bars of one tenth of each price.
fig = go.Figure(
    data=go.Scatter(
        x=lot_area['LotArea'],
        y=lot_area['SalePrice'],
        error_y=dict(
            type='data',
            color='red',
            array=lot_area['SalePrice'] / 10,
            visible=True,
        ),
    )
)
fig.update_layout(
    title='Sales Price Vs Area - Symmetric Error Bars',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()
Purpose: Display the variability of data and used on graphs to indicate the error.
Question: What is the variability or SD of house prices? Different values between high and low interval.
# Asymmetric error bars: `array` sets the upper length and `arrayminus` the
# lower length, each derived from the price.
asymmetric_error = dict(
    type='data',
    color='red',
    symmetric=False,
    array=lot_area['SalePrice'] / 100 - 500,
    arrayminus=lot_area['SalePrice'] / 100 + 10000,
)
price_trace = go.Scatter(
    x=lot_area['LotArea'],
    y=lot_area['SalePrice'],
    error_y=asymmetric_error,
)
fig = go.Figure(data=price_trace)
fig.update_layout(
    title='Sales Price vs Area - Asymmetric Error Bars',
    xaxis_title='Area',
    yaxis_title='Price',
)
fig.show()
Purpose: Display the density of two continuous variable
Question: How dense is the relationship between the college interview test percentage and the degree percentage?
# 2D histogram of the etest_p column against the degree_p column.
density_trace = go.Histogram2d(x=campus['etest_p'], y=campus['degree_p'])
fig = go.Figure(density_trace)
fig.update_layout(
    title='Density of Interview Test &Degree Percent Age',
    xaxis_title='Test Percentage',
    yaxis_title='Degree Percentage',
)
fig.show()
Purpose: Display the density of two continuous variable with custom bin size
Question: How much dense would be the relationship between college test percentage for interview and degree percentage with custom bin size of 20?
# Same 2D histogram with explicit y binning: bins start at 30, 20 wide.
y_binning = {'start': 30, 'size': 20}
density_trace = go.Histogram2d(
    x=campus['etest_p'],
    y=campus['degree_p'],
    coloraxis='coloraxis',
    ybins=y_binning,
)
fig = go.Figure(density_trace)
fig.update_layout(
    title='Density of Interview Test & Degree Percentage with bin size 20',
    xaxis_title='Test Percentage',
    yaxis_title='Degree Percentage',
)
fig.show()
Purpose: Display the density of two continuous variable with facet of many categories.
Question: How much dense would be the relationship between Age and Fare price in Titanic? Facet by Sex and Survived
# Age/Fare density heatmap, faceted into rows by Survived and columns by Sex.
fig = px.density_heatmap(
    titanic,
    x='Age',
    y='Fare',
    facet_row='Survived',
    facet_col='Sex',
)
fig.update_layout(title='Density heatmap of Age vs Fare with Survived and Sex')
fig.show()
Purpose: Display the contour lines of 2D numerical array z, i.e interpolated lines of isovalues of z
Question: How dense is the relationship between LotFrontage and LotArea, with interpolation of SalePrice?
# Houses whose OverallQual is 10.
cond_10 = house[house['OverallQual'] == 10]

# Contour of SalePrice (z) over LotFrontage (x) and LotArea (y).
contour_trace = go.Contour(
    x=cond_10['LotFrontage'],
    y=cond_10['LotArea'],
    z=cond_10['SalePrice'],
    colorscale='Electric',
)
fig = go.Figure(contour_trace)
fig.update_layout(title='Density Contour of house price based on Area and Fontage')
fig.show()
Purpose: Display the contour lines of a 2D numerical array z, i.e interpolated lines of isovalues of z
Question: How much dense would be the relationship between LotFrontage and LotArea with interpolation of SalesPrice? Modify ColorBar
# Contour of SalePrice over LotFrontage / LotArea with a customised colour bar.
# The colour-bar title is given in its nested form (title.text / title.side /
# title.font); the flat `titleside` / `titlefont` attributes are deprecated
# and rejected by recent plotly releases.
fig = go.Figure(
    data=go.Contour(
        x=cond_10['LotFrontage'],
        y=cond_10['LotArea'],
        z=cond_10['SalePrice'],
        colorscale='gnbu',
        colorbar=dict(
            title=dict(
                text='House Price',
                side='right',
                font=dict(size=14, family='Arial, sans-serif'),
            ),
        ),
    )
)
fig.update_layout(title='Density Contour of house price based on Area and Frontage')
fig.show()
Purpose: Display the contour and histogram of two continuous values
Question: How much dense would be the relationship between Price and LotArea? Showcase density and histogram of both values
# Houses whose OverallQual is 8, and the two columns plotted against each other.
cond_8 = house[house['OverallQual'] == 8]
x, y = cond_8['LotArea'], cond_8['SalePrice']

fig = go.Figure()
# Main panel (axes x / y): density contours with the raw points on top.
fig.add_trace(go.Histogram2dContour(
    x=x, y=y,
    colorscale='gray', reversescale=True,
    xaxis='x', yaxis='y',
))
fig.add_trace(go.Scatter(
    x=x, y=y,
    xaxis='x', yaxis='y',
    mode='markers',
    marker=dict(color="red", size=3),
))
# Marginal histograms: y values against the secondary x axis, x values
# against the secondary y axis.
fig.add_trace(go.Histogram(y=y, xaxis='x2', marker=dict(color="blue")))
fig.add_trace(go.Histogram(x=x, yaxis='y2', marker=dict(color="blue")))

# Primary axes take the first 85% of each direction; secondary axes take the
# remaining 15% strip.
fig.update_layout(
    autosize=False,
    xaxis=dict(zeroline=False, domain=[0, 0.85], showgrid=False),
    yaxis=dict(zeroline=False, domain=[0, 0.85], showgrid=False),
    xaxis2=dict(zeroline=False, domain=[0.85, 1], showgrid=False),
    yaxis2=dict(zeroline=False, domain=[0.85, 1], showgrid=False),
    height=600,
    width=600,
    bargap=0,
    hovermode='closest',
    showlegend=False,
    title_text="Density Contour of Price and Area for Condition 8 houses",
    title_x=0.5,
)
fig.show()
Purpose: Display the distribution of a continuous variable
Question: How much spread do the Indian university scores have?
# Single violin showing the `score` values of the rows whose country is India.
ind_score = world.loc[world['country'] == 'India']
india_violin = go.Violin(
    y=ind_score['score'],
    x0='India score',
    marker_color='blue',
)
fig = go.Figure(data=india_violin)
fig.update_layout(title='Distribution of India Universities score')
fig.show()
Purpose: Display the distribution of a continuous variable with a violin plot and an embedded box plot
Question: How much spread do the Portuguese university scores have?
# Violin of the `score` values for Portugal, with the inner box plot and the
# mean line switched on.
# Despite its name, `aus_score` holds the rows for Portugal; the name is kept
# so any other cell referring to it keeps working.
aus_score = world[world['country']=='Portugal']
fig = go.Figure(
    data=go.Violin(
        y=aus_score['score'],
        box_visible=True,
        meanline_visible=True,
        line_color='black',
        fillcolor='lightseagreen',
        opacity=0.6,
        x0='Portugal score',
    )
)
# Title fixed: it previously ended in "score_india", a leftover from the India plot.
fig.update_layout(yaxis_zeroline=False, title='Distribution of Portugal Universities score')
fig.show()
Purpose: Display the distributions of multiple continuous variables side by side
Question: How much spread do the university scores of the top countries have?
# Ridgeline-style plot: one horizontal, one-sided violin of `score` per
# country, for the 10 countries with the most rows in `world`.

# value_counts() sorts by frequency, so its index already lists the countries
# in descending order. Reading the index directly avoids depending on the
# column name produced by reset_index(), which differs between pandas versions
# ('index' before 2.0, the original column name from 2.0 on).
names = world['country'].value_counts().index[:10].tolist()

# Keep the per-country score series in a plain list: they generally have
# different lengths, and wrapping ragged sequences in np.array() raises on
# recent NumPy versions.
temp_list = [world[world['country'] == i]['score'] for i in names]

colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', 10, colortype='rgb')
fig = go.Figure()
for data_line, color, n in zip(temp_list, colors, names):
    fig.add_trace(go.Violin(x=data_line, line_color=color, name=n))
fig.update_traces(orientation='h', side='positive', width=2, points=False)
fig.update_layout(
    title='Distribution of top countries Scores',
    xaxis_showgrid=False,
    yaxis_zeroline=True,
    height=800,
)
fig.show()
Purpose: Display the joint distribution of more than one categorical variable in a parallel-categories view
Question: How strongly are survival, age category and sex connected?
# Parallel-categories diagram over three categorical titanic columns.
# NOTE(review): 'survived_or_not' and 'age_category' are presumably derived
# columns added to `titanic` earlier in the file - confirm they exist.
category_dims = ['survived_or_not', 'age_category', 'Sex']
fig = px.parallel_categories(titanic, dimensions=category_dims)
fig.update_layout(title='Titanic Parallel Categories Diagram')
fig.show()
Purpose: Display the joint distribution of more than one categorical variable in a parallel-categories view, colored by a continuous variable
Question: How strongly are survival, passenger class and sex connected, and how does age vary across them?
# Parallel-categories diagram over survival, class and sex, with the ribbons
# colored by the continuous `Age` column.
category_dims = ['survived_or_not', 'Pclass', 'Sex']
axis_labels = {'survived_or_not':'Survived', 'Pclass':'Class'}  # display names
fig = px.parallel_categories(
    titanic,
    dimensions=category_dims,
    color='Age',
    color_continuous_scale=px.colors.sequential.Aggrnyl,
    labels=axis_labels,
)
fig.update_layout(title='Titanic Parallel Categories Diagram')
fig.show()
Purpose: Display values in table format
# Plain go.Table of the first five netflix rows.
# Header label -> dataframe column; a single mapping keeps the header and the
# cell columns in the same order. The 'Country' label previously carried a
# stray leading space (' Country').
table_columns = {
    'Title': 'title',
    'Release Year': 'release_year',
    'Duration': 'duration',
    'Country': 'country',
}
tab_netflix = netflix[:5][list(table_columns.values())]
fig = go.Figure(data=[
    go.Table(
        header=dict(values=list(table_columns)),
        cells=dict(values=[tab_netflix[column] for column in table_columns.values()]),
    )
])
fig.show()
Purpose: Display values in table format, with a styled header and a different color for each column
# Styled go.Table: gray header with white text, and one fill/line color per
# column for the cells (the i-th color applies to the i-th column).
colors = ['lightblue', 'lightpink', 'lightgreen', 'yellow']

header_style = dict(
    values=['Title', 'Release Year', 'Duration', 'Country'],
    line_color='white',
    fill_color='gray',
    align='center',
    font=dict(color='white', size=12),
)
cell_style = dict(
    values=[
        tab_netflix['title'],
        tab_netflix['release_year'],
        tab_netflix['duration'],
        tab_netflix['country'],
    ],
    line_color=colors,
    fill_color=colors,
    align='center',
    font=dict(color='black', size=11),
)

fig = go.Figure(data=[go.Table(header=header_style, cells=cell_style)])
fig.show()
Purpose: Display values in table format (Figure Factory format)
# Figure-factory table built straight from the dataframe with default styling.
fig = ff.create_table(table_text=tab_netflix)
fig.show()
Purpose: Display values in figure factory table format with custom colors and sizes
# Figure-factory table with a custom colorscale, a wider layout and larger text.
colorscale = [[0, '#4d004c'], [.5, '#f2e5ff'], [1, '#ffffff']]
fig = ff.create_table(tab_netflix, colorscale=colorscale)
fig.layout.width = 1250  # Adjust width layout
# Make text size larger: bump the font of every layout annotation.
# Iterating the annotations directly replaces the range(len(...)) index loop.
for annotation in fig.layout.annotations:
    annotation.font.size = 14
fig.show()
Purpose: Display more than one plot and arrange by row and column
Question: Plot Covid death time series for Canada, Russia, UK, Australia
def _deaths_by_date(country):
    """Return the summed `Deaths` per `ObservationDate` for one `Country/Region` value."""
    rows = covid[covid['Country/Region'] == country]
    return rows[['ObservationDate', 'Deaths']].groupby('ObservationDate').sum().reset_index()


# One aggregated frame per country; these names are reused by the shared-axis
# subplot example further down.
covid_can = _deaths_by_date("Canada")
covid_rus = _deaths_by_date("Russia")
covid_uk = _deaths_by_date("UK")
covid_aus = _deaths_by_date("Australia")

# 2 x 2 grid, one line chart per country
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Canada Covid Deaths", "Russia Covid Deaths", "UK Covid Deaths", "Australia Covid Deaths"),
)
# (frame, legend name, grid row, grid column)
grid_cells = [
    (covid_can, "Canada", 1, 1),
    (covid_rus, "Russia", 1, 2),
    (covid_uk, "UK", 2, 1),
    (covid_aus, "Australia", 2, 2),
]
for frame, label, row_no, col_no in grid_cells:
    fig.add_trace(
        go.Scatter(x=frame['ObservationDate'], y=frame['Deaths'], name=label),
        row=row_no, col=col_no,
    )
fig.update_layout(height=600, width=800, title_text="Countries Covid Deaths-Side By Side Subplots")
fig.show()
Purpose: Display more than one plot arranged by row and column, with a common x axis
Question: Plot the Covid death time series for Canada, Russia, UK and Australia on a common x axis
# Four stacked line charts (one row per country) that share a single x axis.
fig = make_subplots(
    rows=4, cols=1,
    shared_xaxes=True,  # change this line to shared_yaxes=True for shared y axes
    vertical_spacing=0.02,
)
# Top-to-bottom order of the rows
stacked_rows = [
    (covid_can, "Canada"),
    (covid_rus, "Russia"),
    (covid_uk, "UK"),
    (covid_aus, "Australia"),
]
for row_no, (frame, label) in enumerate(stacked_rows, start=1):
    fig.add_trace(
        go.Scatter(x=frame['ObservationDate'], y=frame['Deaths'], name=label),
        row=row_no, col=1,
    )
fig.update_layout(height=600, width=800, title_text="Countries Covid Deaths-Shared Axis Subplots")
fig.show()
Purpose: Display more than one plot arranged by row and column, with a color scale shared across the subplots
Question: Plot Netflix show counts per release year for the US, India, Japan and Canada
def _shows_per_year(country):
    """Return a frame with columns `release_year` and `count` for one exact `country` value.

    Rows are ordered by descending count, as produced by value_counts().
    """
    years = netflix.loc[netflix['country'] == country, 'release_year']
    # rename_axis + reset_index(name=...) names both columns explicitly. The
    # previous rename(columns={'index': ..., 'release_year': 'count'}) relied on
    # the pre-2.0 pandas naming of value_counts() output and, on pandas >= 2.0,
    # produced two 'count' columns and no 'release_year' column.
    return years.value_counts().rename_axis('release_year').reset_index(name='count')


# One frame per country; `net_ind` is reused by the mixed-type subplot example.
net_us = _shows_per_year('United States')
net_ind = _shows_per_year('India')
net_jap = _shows_per_year('Japan')
net_can = _shows_per_year('Canada')

# 1 x 4 row of bar charts sharing the y axis and a single color axis
fig = make_subplots(
    rows=1, cols=4,
    subplot_titles=("US Netflix Shows", "India Netflix Shows", "Japan Netflix Shows", "Canada Netflix Shows"),
    shared_yaxes=True,
)
for col_no, (frame, label) in enumerate(
    [(net_us, "US"), (net_ind, "India"), (net_jap, "Japan"), (net_can, "Canada")],
    start=1,
):
    fig.add_trace(
        go.Bar(
            x=frame['release_year'],
            y=frame['count'],
            name=label,
            # Every trace points at the same color axis, which is the one
            # configured through `coloraxis=` in update_layout below.
            marker=dict(color=frame['count'], coloraxis="coloraxis"),
        ),
        row=1, col=col_no,
    )
fig.update_layout(
    coloraxis=dict(colorscale='RdBu'),
    title_text="Countries Netflix show counts -Shared Colorscale Subplots",
    showlegend=False,
)
fig.show()
Purpose: Display more than one plot of different types and arrange by row and column
Question: Plot Netflix show counts (Bar chart) & Playstore Apps (Pie chart) together?
# Bar chart and pie chart side by side in one figure.
# Top five app categories by row count, as a frame with columns `Category`
# and `count`. rename_axis + reset_index(name=...) names both columns
# explicitly; the previous rename(columns={'index': ..., 'Category': 'count'})
# left no `Category` column on pandas >= 2.0.
categ_apps = (
    google['Category']
    .value_counts()
    .head(5)
    .rename_axis('Category')
    .reset_index(name='count')
)
# Each cell of `specs` declares the trace type its subplot will host; a pie
# needs a non-cartesian ("pie"/domain) cell.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Indian Netflix shows by year", "Playstore Apps Categories"],
    specs=[[{"type": "bar"}, {"type": "pie"}]],
)
fig.add_trace(go.Bar(x=net_ind['release_year'], y=net_ind['count'], name="India"), row=1, col=1)
fig.add_trace(go.Pie(labels=categ_apps['Category'], values=categ_apps['count']), row=1, col=2)
fig.update_layout(title_text="Multiple Subplots", showlegend=False)
fig.show()